Missing Data Patterns in Humanitarian Security Incidents
Code
# Load the cleaned security-incident dataset.
df = pd.read_csv("data/security_incidents_cleaned.csv")

# Countries of interest
countries = [
    "Occupied Palestinian Territories",
    "Afghanistan",
    "DR Congo",
    "Syrian Arab Republic",
    "Somalia",
    "Mali",
    "South Sudan",
    "Sudan",
]

# Keep only the incidents recorded in those countries.
# The explicit .copy() makes df_filtered an independent DataFrame rather than
# a slice of df, so columns can be added to it later without pandas raising
# SettingWithCopyWarning.
df_filtered = df[df["country"].isin(countries)].copy()

# Total incidents per country (country -> row count), used for normalization.
country_totals = df_filtered.groupby("country").size().to_dict()
1. Bar Chart of Unknown Values by Column and Country
Code
# Grouped bar chart: for every country, the percentage of incidents whose
# value in each analyzed column equals `value`, plus a line showing the
# per-country average across those columns.

# Columns to analyze
columns_to_check = [
    "means_of_attack", "attack_context", "location",
    "motive", "actor_type", "actor_name",
]

# Total incidents per country (kept for normalization / inspection)
country_totals = df_filtered.groupby('country').size()

value = "Unknown"  # Change this to analyze a different value

# Boolean mask of the cells equal to `value`, grouped by the row's country.
# One vectorized pass replaces per-country re-filtering and row-wise apply().
is_match = df_filtered[columns_to_check].eq(value)
matches_by_country = is_match.groupby(df_filtered['country'])

# The mean of a boolean column is the share of matching rows, so x100 gives
# the percentage. Countries without any incident get 0.0.
results = (
    (matches_by_country.mean() * 100)
    .reindex(countries)
    .fillna(0.0)
    .astype(float)
    .rename_axis(None)
)

# Total number of matching cells per country (all analyzed columns combined)
unknown_counts = (
    matches_by_country.sum()
    .sum(axis=1)
    .reindex(countries, fill_value=0)
    .astype(int)
    .to_dict()
)

# Average percentage across all columns, then sort by it (descending)
results['average'] = results[columns_to_check].mean(axis=1)
results = results.sort_values('average', ascending=False)

# Create the bar chart
plt.figure(figsize=(14, 8))

# Grouped bars, one bar per analyzed column
results[columns_to_check].plot(kind='bar', ax=plt.gca(), width=0.8)

# Line for the average; bar groups sit at x = 0..n-1 in the sorted order
plt.plot(
    range(len(results)), results['average'],
    'ko-', linewidth=2, markersize=8, label='Average',
)

# Annotate each average with its value and the raw number of occurrences
for i, (country, avg) in enumerate(results['average'].items()):
    plt.annotate(
        f"{avg:.1f}%\n({unknown_counts[country]} occurrences)",
        xy=(i, avg),
        xytext=(0, 10),
        textcoords='offset points',
        ha='center',
        va='bottom',
        fontweight='bold',
        bbox=dict(boxstyle="round,pad=0.3", fc="white", ec="gray", alpha=0.8),
    )

plt.title(f"Percentage of '{value}' Values Across All Columns by Country")
plt.xlabel("Country")
plt.ylabel(f"Percentage of '{value}' Values")
plt.legend(title="Column", bbox_to_anchor=(1.01, 1), loc='upper left')
plt.xticks(rotation=45, ha='right')
plt.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.show()
2. Heatmap of Unknown Values by Column and Country
Code
# Heatmap: percentage of incidents per country whose value in each analyzed
# column equals `value`; rows are sorted by the per-country average.

# Columns to analyze
columns_to_check = [
    "means_of_attack", "attack_context", "location",
    "motive", "actor_type", "actor_name",
]

# Total incidents per country (kept for normalization / inspection)
country_totals = df_filtered.groupby('country').size()

value = "Unknown"  # Change this to analyze a different value

# The mean of the boolean "cell == value" mask per country is the share of
# matching rows; x100 turns it into a percentage. Countries without any
# incident get 0.0.
is_match = df_filtered[columns_to_check].eq(value)
heatmap_data = (
    (is_match.groupby(df_filtered['country']).mean() * 100)
    .reindex(countries)
    .fillna(0.0)
    .astype(float)
    .rename_axis(None)
)

# Average across columns is only used for ordering the rows
heatmap_data['average'] = heatmap_data[columns_to_check].mean(axis=1)
heatmap_data = heatmap_data.sort_values('average', ascending=False)

# Create heatmap visualization (without the average column)
plt.figure(figsize=(14, 8))
sns.heatmap(
    heatmap_data[columns_to_check],
    annot=True,
    fmt=".1f",
    cmap="YlOrRd",
    linewidths=0.5,
    # Labels follow `value` so they stay correct when it is changed above
    cbar_kws={'label': f'% {value}'},
)

plt.title(f"Percentage of '{value}' Values by Country and Column")
plt.tight_layout()
plt.show()
3. Data Completeness Stacked Bar Chart
Code
# Stacked horizontal bar chart: per country, the share of analyzed cells that
# are known versus "Unknown".

# Columns to analyze
columns_to_check = [
    "means_of_attack", "attack_context", "location",
    "motive", "actor_type", "actor_name",
]

# Count 'Unknown' values per row (vectorized). assign() returns a new
# DataFrame, so no column is written into a slice of the original frame
# (avoids SettingWithCopyWarning).
df_filtered = df_filtered.assign(
    unknown_count=df_filtered[columns_to_check].eq("Unknown").sum(axis=1)
)

# Total unknown values per country
country_unknown = df_filtered.groupby('country')['unknown_count'].sum()

# Total possible unknown values (# rows * # columns)
country_total_possible = (
    df_filtered.groupby('country').size() * len(columns_to_check)
)

# Percentages, aligned to the configured country list
unknown_pct = (country_unknown / country_total_possible * 100).reindex(countries)
known_pct = 100 - unknown_pct

# Sort by percentage of known values (ascending)
sorted_countries = known_pct.sort_values().index

# Wide table: one row per country
stacked_data = pd.DataFrame({
    'country': sorted_countries,
    'Known': known_pct[sorted_countries].values,
    'Unknown': unknown_pct[sorted_countries].values,
})

# Long-form version of the same data (one row per country / status)
plot_data = pd.melt(
    stacked_data,
    id_vars=['country'],
    value_vars=['Known', 'Unknown'],
    var_name='Data Status',
    value_name='Percentage',
)

# Plot. The Unknown segment starts where the Known segment ends (left=...),
# so each country gets one full-width stacked bar and the centred label sits
# on a single bar instead of straddling two side-by-side ones.
fig, ax = plt.subplots(figsize=(12, 7))
ax.barh(
    stacked_data['country'], stacked_data['Known'],
    color='#1D70B8', label='Known',      # Blue for Known
)
ax.barh(
    stacked_data['country'], stacked_data['Unknown'],
    left=stacked_data['Known'],
    color='#F2645A', label='Unknown',    # Red for Unknown
)
# First (least complete) country at the top
ax.invert_yaxis()

# Add percentage labels; bar i is drawn at y position i
for i, country in enumerate(sorted_countries):
    known = known_pct[country]
    ax.text(
        50, i, f"{known:.1f}% Complete",
        ha='center', va='center',
        color='white', fontweight='bold',
    )

plt.title("Data Completeness by Country (%)")
plt.xlabel("Percentage")
plt.ylabel("Country")
plt.legend(title='', loc='lower right')
plt.tight_layout()
plt.show()
4. Distribution of Unknown Fields per Record
Code
# Stacked horizontal bars: for each country, the percentage of records having
# 0, 1, 2, ... "Unknown" values among the analyzed columns.

# Columns to analyze
columns_to_check = [
    "means_of_attack", "attack_context", "location",
    "motive", "actor_type", "actor_name",
]

# Count 'Unknown' values per row if not already done. assign() returns a new
# DataFrame instead of writing into a slice (avoids SettingWithCopyWarning).
if "unknown_count" not in df_filtered.columns:
    df_filtered = df_filtered.assign(
        unknown_count=df_filtered[columns_to_check].eq("Unknown").sum(axis=1)
    )

# Distribution of records by number of unknown fields; normalize='index'
# makes every country's row sum to 1, x100 turns it into percentages.
unknown_dist = pd.crosstab(
    index=df_filtered['country'],
    columns=df_filtered['unknown_count'],
    normalize='index',
) * 100

# Sort countries by average number of unknown fields
avg_unknown = (
    df_filtered.groupby('country')['unknown_count']
    .mean()
    .sort_values(ascending=False)
)
unknown_dist = unknown_dist.reindex(avg_unknown.index)

# Plot stacked bars on an explicitly created Axes. Calling DataFrame.plot
# with figsize but without ax opens a second figure and leaves the one from
# plt.figure() empty, so the Axes is passed in instead.
fig, ax = plt.subplots(figsize=(14, 8))
unknown_dist.plot(
    kind='barh',
    stacked=True,
    ax=ax,
    cmap='YlOrRd',
    width=0.8,
)

# Add text for average unknown count, just right of the 100% mark
for i, country in enumerate(unknown_dist.index):
    avg = avg_unknown[country]
    plt.text(
        101, i, f"Avg: {avg:.1f} fields",
        va='center', fontsize=10, fontweight='bold',
    )

plt.title("Distribution of Records by Number of Unknown Fields")
plt.xlabel("Percentage of Records")
plt.ylabel("Country")
plt.legend(
    title='Number of Unknown Fields',
    bbox_to_anchor=(1.01, 1),
    loc='upper left',
)
plt.xlim(0, 120)  # Leave room for annotations
plt.grid(False)
plt.tight_layout()
plt.show()
<Figure size 4200x2400 with 0 Axes>
5. Total Missing Data by Country
Code
# Horizontal bar chart: overall percentage of "Unknown" cells per country
# across all analyzed columns, labelled with the raw counts.

# Columns to analyze
columns_to_check = [
    "means_of_attack", "attack_context", "location",
    "motive", "actor_type", "actor_name",
]

# Count 'Unknown' values per row if not already done. assign() returns a new
# DataFrame instead of writing into a slice (avoids SettingWithCopyWarning).
if "unknown_count" not in df_filtered.columns:
    df_filtered = df_filtered.assign(
        unknown_count=df_filtered[columns_to_check].eq("Unknown").sum(axis=1)
    )

n_columns = len(columns_to_check)

# Calculate total and percentage of unknown values by country
country_stats = []
for country in countries:
    country_data = df_filtered[df_filtered['country'] == country]
    total_incidents = len(country_data)
    if total_incidents == 0:
        # No incidents: nothing to report and no division by zero
        continue

    total_unknown = country_data['unknown_count'].sum()
    total_possible = total_incidents * n_columns
    country_stats.append({
        'country': country,
        'total_incidents': total_incidents,
        'total_unknown': total_unknown,
        'percent_unknown': (total_unknown / total_possible) * 100,
        'avg_unknown_per_record': total_unknown / total_incidents,
    })

# Convert to DataFrame and sort. The columns are listed explicitly so the
# frame still has them (and sort_values works) when country_stats is empty.
stats_df = pd.DataFrame(
    country_stats,
    columns=[
        'country', 'total_incidents', 'total_unknown',
        'percent_unknown', 'avg_unknown_per_record',
    ],
)
stats_df = stats_df.sort_values('percent_unknown', ascending=False)

# Create bar chart
plt.figure(figsize=(12, 7))
bars = plt.barh(
    stats_df['country'],
    stats_df['percent_unknown'],
    color='#F2645A',
    alpha=0.8,
)

# Add text labels just past the end of each bar; bar i is at y position i
for i, row in enumerate(stats_df.itertuples()):
    plt.text(
        row.percent_unknown + 0.5,
        i,
        f"{row.percent_unknown:.1f}% ({row.total_unknown}/{row.total_incidents * n_columns})",
        va='center',
    )

plt.title("Overall Percentage of Unknown Values by Country")
plt.xlabel("Percentage of Unknown Values")
plt.ylabel("Country")
plt.grid(axis='x', alpha=0.3)
plt.tight_layout()
plt.show()